import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import sklearn
import seaborn as sns
import os
import re
import plotly.express as px
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from pyLDAvis import sklearn as sklearn_lda  # renamed to pyLDAvis.lda_model in pyLDAvis >= 3.4
import pickle
import pyLDAvis
import warnings
warnings.filterwarnings('ignore')
qc_questions = pd.read_csv('QC-ARC-Questions.tsv',delimiter='\t',encoding='utf-8')
qc_taxonomy = pd.read_csv('QC-ARC-Taxonomy.tsv', delimiter='\t', encoding='utf-8')
qc_questions.head()
qc_questions['QCLabel'].value_counts()
qc_questions.shape[0]
For this stage, I will try out a variety of visualization techniques on the K12 QC ARC question bank, looking at the question categories and how frequently similar questions appear across different subjects.
To build a WordCloud visualization of the questions, we first run every question through the following data pipeline:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def port_stem(s):
    """Apply the Porter stemmer to each token in a list."""
    p_stemmer = nltk.PorterStemmer()
    return [p_stemmer.stem(word) for word in s]
# Tokenize each question and drop English stop words.
tokenized_questions = qc_questions['Question'].str.lower().apply(nltk.word_tokenize)
tokenized_questions = tokenized_questions.apply(lambda tokens: [w for w in tokens if w not in stop_words])
# Keep only the nouns (singular/plural, common/proper), then stem them and drop single-character leftovers.
tokenized_q_pos = tokenized_questions.apply(nltk.pos_tag)
tokenized_nouns = tokenized_q_pos.apply(lambda tags: [word for word, pos in tags if pos in ('NN', 'NNP', 'NNS', 'NNPS')])
tokenized_nouns = tokenized_nouns.apply(port_stem).apply(lambda nouns: [noun for noun in nouns if len(noun) > 1])
# Re-join the surviving tokens into one string per question.
retokenized = tokenized_nouns.apply(lambda tokens: ' '.join(tokens))
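Before aggregating everything into one string for the word cloud, a quick illustrative check on a single made-up question (not taken from the dataset) shows what each stage of the pipeline keeps:
sample = 'Which organ pumps blood through the human body?'  # hypothetical example question
sample_tokens = [w for w in nltk.word_tokenize(sample.lower()) if w not in stop_words]
sample_nouns = [w for w, pos in nltk.pos_tag(sample_tokens) if pos in ('NN', 'NNP', 'NNS', 'NNPS')]
print(port_stem(sample_nouns))  # roughly ['organ', 'blood', 'bodi'], depending on the tagger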
# Aggregate the cleaned tokens from every question into a single string.
qtext = ' '.join(retokenized)
wordcloud = WordCloud(background_color="white").generate(qtext)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
category = qc_questions['QCLabel'].apply(lambda x: x.split('_')[0]).apply(lambda x: 'LIFE' if x == 'OTHER, LIFE' else x)
qc_questions['Subject'] = category
plt.title('Distribution of Subjects Per Exam Question')
sns.countplot(x='Subject', data=qc_questions, palette="Set3", order = qc_questions['Subject'].value_counts().index);
As a next step, to visualize how each exam question is phrased and the similarity/difference among questions across different subjects, we will compute TF-IDF values as part of our feature extraction process and run an initial MiniBatch K-means clustering algorithm on the data.
First, we obtain the feature matrix for all the questions using TfidfVectorizer.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """Lowercase, strip punctuation/symbols, and remove stop words."""
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = text.replace('x', '')  # drop literal 'x' characters
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
qc_questions['Q_cleaned'] = qc_questions['Question'].apply(clean_text).str.replace(r'\d+', '', regex=True)
vec = TfidfVectorizer(stop_words="english")
vec.fit(qc_questions['Q_cleaned'].values)
features = vec.transform(qc_questions['Q_cleaned'].values)
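As a quick sanity check (a sketch assuming scikit-learn >= 1.0, where get_feature_names_out is available), we can inspect the resulting sparse document-term matrix:
print('TF-IDF matrix shape:', features.shape)  # (number of questions, vocabulary size)
print('Sample vocabulary terms:', vec.get_feature_names_out()[:10])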
To decide how many clusters the K-means algorithm should aim for, we can start with a guess and gradually increase the number of clusters, fitting one model per candidate value.
cls_s = []
# Fit one MiniBatchKMeans model per candidate cluster count (5 through 15).
for i in range(5, 16):
    cls = MiniBatchKMeans(n_clusters=i, random_state=0)
    cls.fit(features)
    cls_s.append(cls)
for cls in cls_s:
    cls.predict(features)
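One common way to narrow down the choice, sketched below, is an elbow plot of each fitted model's inertia_ (within-cluster sum of squares): a kink in the curve suggests a reasonable cluster count.
plt.figure(figsize=(8, 4))
plt.plot([cls.n_clusters for cls in cls_s], [cls.inertia_ for cls in cls_s], marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Curve for MiniBatch K-Means')
plt.show()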
To visualize the clusters, we'll plot the features in a 3D space. The feature matrix produced by TfidfVectorizer is very high-dimensional, so we need to reduce the dimensionality before we can plot; for this, we'll use PCA to project the features down to three dimensions.
Note: hovering over an individual data point (exam question) in the 3D scatterplots below shows the corresponding subject for that question.
pca = PCA(n_components=3)
reduced_features = pca.fit_transform(features.toarray())
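Since TF-IDF features are so high-dimensional, it is worth checking (as an optional aside) how much variance the three principal components actually retain; with sparse text features this is often quite low, so the 3D plots below are only a rough picture of the cluster structure.
print('Explained variance ratio:', pca.explained_variance_ratio_)
print('Total variance retained: {:.1%}'.format(pca.explained_variance_ratio_.sum()))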
# Project each model's cluster centers into the same PCA space as the questions.
reduced_cluster_centers = pca.transform(cls_s[0].cluster_centers_)
import plotly.graph_objects as go
fig = go.Figure(data=[
go.Scatter3d(x=reduced_features[:,0], y=reduced_features[:,1], z=reduced_features[:,2],
mode='markers',text=qc_questions['Subject'], marker=dict(size=4,color=cls_s[0].predict(features),
colorscale='Rainbow',opacity=0.6)),
go.Scatter3d(x=reduced_cluster_centers[:,0], y=reduced_cluster_centers[:,1], z=reduced_cluster_centers[:,2],
mode='markers',marker=dict(size=6, symbol='x', color='rgb(0, 0, 0)'))
]
)
fig.update_layout(
scene = dict(
xaxis_title='PC1',
yaxis_title='PC2',
zaxis_title='PC3'
),
title='K-Means Clustering of Exam Questions (5 Clusters)'
)
fig.show()
reduced_cluster_centers = pca.transform(cls_s[5].cluster_centers_)
fig = go.Figure(data=[
go.Scatter3d(x=reduced_features[:,0], y=reduced_features[:,1], z=reduced_features[:,2],
mode='markers',text=qc_questions['Subject'], marker=dict(size=4,color=cls_s[5].predict(features),
colorscale='Rainbow',opacity=0.6)),
go.Scatter3d(x=reduced_cluster_centers[:,0], y=reduced_cluster_centers[:,1], z=reduced_cluster_centers[:,2],
mode='markers',marker=dict(size=6, symbol='x', color='rgb(0, 0, 0)'))
]
)
fig.update_layout(
scene = dict(
xaxis_title='PC1',
yaxis_title='PC2',
zaxis_title='PC3'
),
title='K-Means Clustering of Exam Questions (10 Clusters)'
)
fig.show()
reduced_cluster_centers = pca.transform(cls_s[-1].cluster_centers_)
fig = go.Figure(data=[
go.Scatter3d(x=reduced_features[:,0], y=reduced_features[:,1], z=reduced_features[:,2],
mode='markers', text=qc_questions['Subject'], marker=dict(size=4,color=cls_s[-1].predict(features),
colorscale='Rainbow',opacity=0.6)),
go.Scatter3d(x=reduced_cluster_centers[:,0], y=reduced_cluster_centers[:,1], z=reduced_cluster_centers[:,2],
mode='markers',marker=dict(size=6, symbol='x', color='rgb(0, 0, 0)'))
]
)
fig.update_layout(
scene = dict(
xaxis_title='PC1',
yaxis_title='PC2',
zaxis_title='PC3'
),
title='K-Means Clustering of Exam Questions (15 Clusters)'
)
fig.show()
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(qc_questions['Question'])
def count_topics(model, count_vectorizer, n_top_words):
    # get_feature_names_out() is the scikit-learn >= 1.0 API; older versions use get_feature_names().
    words = count_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
number_topics = 20
number_words = 10
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
print("Topics found via LDA:")
count_topics(lda, count_vectorizer, number_words)
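As an optional follow-up (the DominantTopic column name below is just illustrative), lda.transform gives each question's topic proportions, from which we can read off the most probable topic per question:
topic_distributions = lda.transform(count_data)  # shape: (number of questions, number_topics)
qc_questions['DominantTopic'] = topic_distributions.argmax(axis=1)
print(qc_questions[['Subject', 'DominantTopic']].head())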
LDAvis_data_filepath = 'ldavis_prepared_' + str(number_topics)
LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
pyLDAvis.display(LDAvis_prepared)
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
MAX_WORD_COUNT = 5000   # maximum vocabulary size for the tokenizer
MAX_SEQ_SIZE = 50       # questions are padded/truncated to this many tokens
EMBEDDING_DIM = 50      # dimensionality of the learned word embeddings
EPOCHS = 10
BATCH_SIZE = 32
category = qc_questions['QCLabel'].apply(lambda x: x.split('_')[0]).apply(lambda x: 'LIFE' if x == 'OTHER, LIFE' else x)
qc_questions['Subject'] = category
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """Lowercase, strip punctuation/symbols, and remove stop words."""
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = text.replace('x', '')  # drop literal 'x' characters
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
qc_questions['Q_cleaned'] = qc_questions['Question'].apply(clean_text).str.replace(r'\d+', '', regex=True)
tokenizer = Tokenizer(num_words=MAX_WORD_COUNT, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(qc_questions['Q_cleaned'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
X = tokenizer.texts_to_sequences(qc_questions['Q_cleaned'].values)
X = pad_sequences(X, maxlen=MAX_SEQ_SIZE)
print('Shape of Design Matrix:', X.shape)
y = pd.get_dummies(qc_questions['Subject']).values
print('Shape of Label Matrix (one-hot subjects):', y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
model = Sequential()
model.add(Embedding(MAX_WORD_COUNT, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(50))
# Softmax output over the subject classes: categorical cross-entropy expects a probability distribution.
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1)
accr = model.evaluate(X_test,y_test)
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0],accr[1]))
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.legend()
plt.show()
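Finally, a small illustrative sketch of how the trained model could be used to classify a new, made-up question (the question text and the subject_labels variable are assumptions, not part of the dataset); the class order follows pd.get_dummies, i.e. the sorted subject labels.
subject_labels = pd.get_dummies(qc_questions['Subject']).columns
new_question = 'Which force keeps the planets in orbit around the sun?'  # hypothetical question
new_seq = pad_sequences(tokenizer.texts_to_sequences([clean_text(new_question)]), maxlen=MAX_SEQ_SIZE)
pred = model.predict(new_seq)
print('Predicted subject:', subject_labels[pred.argmax(axis=1)[0]])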